by Andrew Trask
In [1]:
def pretty_print_review_and_label(i):
    """Print the label at index ``i`` followed by a preview of the matching review.

    Reads the module-level ``labels`` and ``reviews`` lists. Only the first
    80 characters of the review are shown, followed by "...".
    """
    label = labels[i]
    preview = reviews[i][:80]
    print(label + "\t:\t" + preview + "...")
# Inputs ("what we know"): one review per line of reviews.txt.
# The with-block closes the file even if reading raises.
with open('reviews.txt', 'r') as g:
    # Strip only the line terminator. Slicing with [:-1] would also cut a real
    # character off the final line when the file has no trailing newline.
    reviews = [line.rstrip('\n') for line in g]

# Targets ("what we WANT to know"): one label per line of labels.txt,
# upper-cased so later comparisons against 'POSITIVE' match.
with open('labels.txt', 'r') as g:
    labels = [line.rstrip('\n').upper() for line in g]
In [2]:
# Number of reviews that were loaded.
len(reviews)
Out[2]:
In [3]:
# Full text of the first review.
reviews[0]
Out[3]:
In [4]:
# Label stored at the same index as the first review.
labels[0]
Out[4]:
In [5]:
print("labels.txt \t : \t reviews.txt\n")

# Show label/review pairs for a handful of hard-coded sample indices.
for sample_index in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(sample_index)
In [6]:
from collections import Counter
import numpy as np
In [7]:
# Three independent tallies: tokens seen in POSITIVE reviews, tokens seen in
# all other reviews, and tokens seen in any review.
positive_counts, negative_counts, total_counts = Counter(), Counter(), Counter()
In [8]:
# Tally every space-delimited token of every review, both per label and overall.
# NOTE: the counters are only ever incremented here, so re-running this cell
# without re-creating them adds the counts a second time.
for idx in range(len(reviews)):
    # Anything that is not exactly 'POSITIVE' is counted as negative.
    if labels[idx] == 'POSITIVE':
        label_counts = positive_counts
    else:
        label_counts = negative_counts

    for token in reviews[idx].split(" "):
        label_counts[token] += 1
        total_counts[token] += 1
In [9]:
# The 15 tokens with the highest counts in POSITIVE reviews.
positive_counts.most_common(15)
Out[9]:
In [10]:
# Score each sufficiently frequent token by how strongly it leans positive or
# negative, on a log scale.
pos_neg_ratios = Counter()
for token, occurrences in total_counts.most_common():
    # Only tokens seen more than 100 times in total are scored.
    if not (occurrences > 100):
        continue

    # The +1 in the denominator avoids dividing by zero for tokens that never
    # appear in a negative review.
    raw_ratio = positive_counts[token] / float(negative_counts[token] + 1)

    if raw_ratio > 1:
        pos_neg_ratios[token] = np.log(raw_ratio)
    else:
        # The 0.01 offset keeps the logarithm finite when the ratio is 0.
        pos_neg_ratios[token] = -np.log((1 / (raw_ratio + 0.01)))
In [11]:
# The 15 tokens with the highest pos_neg_ratios score, i.e. those that lean
# most strongly towards the "POSITIVE" label.
pos_neg_ratios.most_common(15)
Out[11]:
In [12]:
# The 30 tokens with the lowest pos_neg_ratios score (lowest first), i.e. those
# that lean most strongly towards the "NEGATIVE" label.
pos_neg_ratios.most_common()[:-31:-1]
Out[12]:
In [13]:
from IPython.display import Image
# Example sentence for the diagram below; `review` is not passed to Image.
review = "This was a horrible, terrible movie."
# Display the image loaded from this relative path.
Image(filename='sentiment_network.png')
Out[13]:
In [14]:
# Example sentence for the diagram below; `review` is not passed to Image.
review = "The movie was excellent"
# Display the image loaded from this relative path.
Image(filename='sentiment_network_pos.png')
Out[14]:
In [15]:
# The vocabulary is every distinct token counted in total_counts.
vocab = set(total_counts)
vocab_size = len(vocab)
print(vocab_size)
In [16]:
# Peek at 15 vocabulary entries; a set has no defined order, so which ones
# appear is arbitrary.
list(vocab)[:15]
Out[16]:
In [17]:
import numpy as np
# Input layer: a single row with one zero-initialised column per vocabulary entry.
layer_0 = np.zeros((1, vocab_size))
layer_0
Out[17]:
In [18]:
from IPython.display import Image
# Display the image loaded from this relative path.
Image(filename='sentiment_network.png')
Out[18]:
In [25]:
# Map every vocabulary word to its column index in the input layer.
# Enumerating sorted(vocab) instead of the set itself makes the mapping
# reproducible: the iteration order of a set of strings depends on per-process
# hash randomisation, so it can change every time the kernel restarts.
word2index = {word: i for i, word in enumerate(sorted(vocab))}

# Small preview of the mapping for display.
word2index_sample = {k: word2index[k] for k in list(word2index.keys())[:15]}
word2index_sample
Out[25]:
In [26]:
def update_input_layer(review):
    """Overwrite the global ``layer_0`` with the token counts of ``review``.

    The review is split on single spaces. For every token, the entry of
    ``layer_0`` at that token's ``word2index`` position is incremented by one.
    Tokens that are missing from ``word2index`` are skipped, so text containing
    unseen words can still be encoded instead of raising ``KeyError``.

    Args:
        review: Text of one review (a string).

    Returns:
        None. ``layer_0`` is modified in place.
    """
    global layer_0

    # Clear the counts left over from the previously encoded review.
    layer_0 *= 0

    for word in review.split(" "):
        index = word2index.get(word)
        if index is not None:
            layer_0[0][index] += 1
# Encode the first review into layer_0.
update_input_layer(reviews[0])
In [27]:
# Show the input layer after it was filled by update_input_layer.
layer_0
Out[27]:
In [28]:
def get_target_for_label(label):
    """Convert a sentiment label into its numeric target.

    Returns 1 when ``label`` equals 'POSITIVE' and 0 for every other value.
    """
    return 1 if label == 'POSITIVE' else 0
In [29]:
# Label at index 0, shown for comparison with its numeric target.
labels[0]
Out[29]:
In [30]:
# Numeric target (1 or 0) for the label at index 0.
get_target_for_label(labels[0])
Out[30]:
In [31]:
# Label at index 1, shown for comparison with its numeric target.
labels[1]
Out[31]:
In [32]:
# Numeric target (1 or 0) for the label at index 1.
get_target_for_label(labels[1])
Out[32]:
In [ ]: